Load Data and Packages

#install.packages("JGL")
#install.packages("JointNets")
#install.packages("tidyverse")

library(JGL)
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.1.2
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(JointNets) # for plotting JGL results 
## Loading required package: lpSolve
## Loading required package: pcaPP
## Loading required package: parallel
## Registered S3 method overwritten by 'JointNets':
##   method   from
##   plot.jgl JGL
## 
## Attaching package: 'JointNets'
## The following object is masked from 'package:JGL':
## 
##     plot.jgl
## The following object is masked from 'package:stats':
## 
##     BIC
library(igraph)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::as_data_frame() masks tibble::as_data_frame(), igraph::as_data_frame()
## x purrr::compose()       masks igraph::compose()
## x tidyr::crossing()      masks igraph::crossing()
## x dplyr::filter()        masks stats::filter()
## x dplyr::groups()        masks igraph::groups()
## x dplyr::lag()           masks stats::lag()
## x purrr::simplify()      masks igraph::simplify()
#source(here::here("code/helper_functions.R"))
source(here::here("code/visualize.r"))
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
theme_set(theme_bw())

# read in data from Raji
dat <- read.csv(here::here("data/hapo_metabolomics_2020.csv"))

The data consists of id, K = 4 ancestry groups (n = 400 each), and metabolites (p = 51 features). There is also a variable “fpg” which is fasting plasma glucose, I will omit that for now.

Let’s first prepare the data by centering and scaling each metabolite feature (within class).

summary(as.factor(dat$anc_gp))
## ag1 ag2 ag3 ag4 
## 400 400 400 400

We can see that the raw data has different distributions, but assumptions of JGL need data to be centered and scaled.

head(dat)[,1:10]
##       id anc_gp  fpg    mt1_1     mt1_2    mt1_3    mt1_4     mt1_5    mt1_6
## 1 hm0001    ag3 75.6 218.2223  76.99525 19.06366 14.23091  86.75162 135.2109
## 2 hm0002    ag3 84.6 292.6314 136.41320 43.14854 17.77549 120.17344 213.6531
## 3 hm0003    ag4 79.2 361.1135  79.98370 22.15848 13.05497  74.75441 136.1587
## 4 hm0004    ag3 73.8 274.0428  79.76154 19.72682 12.68744  69.34854 146.7262
## 5 hm0005    ag1 91.8 270.9049  88.98260 39.20949 15.73498  95.56857 145.2876
## 6 hm0006    ag4 73.8 271.4034  99.63992 27.81137 20.91344  95.55926 223.7438
##      mt1_7
## 1 64.00578
## 2 91.30156
## 3 83.67878
## 4 72.67280
## 5 48.49431
## 6 68.94172
summary(dat$mt1_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   115.6   281.8   322.4   328.5   370.6   660.4
summary(dat$mt2_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -4.631  -3.341  -3.040  -3.044  -2.750  -1.154       1
summary(dat$mt3_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   17.28   21.39   22.64   22.96   24.64   27.25      11
hist(dat$mt1_1)

hist(dat$mt2_1)

hist(dat$mt3_1)

# prepare data into list of K datasets

# make data into long format then center and scale by metabolite and group. 
dat_long <- dat %>%
  select(-fpg) %>%
  pivot_longer(cols = starts_with("mt"),
               names_to = "metabolite",
               values_to = "value",
               values_drop_na = TRUE) %>%
  group_by(anc_gp, metabolite) %>%
  mutate(scaled_value = scale(value, center = TRUE, scale = TRUE)) %>%
  ungroup()

dat_long %>% 
  filter(metabolite %in% c("mt1_1", "mt2_1", "mt3_1")) %>%
  ggplot(aes(x=scaled_value, color=anc_gp)) +
    geom_density()+
  facet_grid(metabolite ~.)

Now it’s ready! But first we need to get it into K n by p matrices as required by JGL package.

?? Should we do any imputation?

# make list of ancestry groups
ancestry_groups <- sort(unique(dat$anc_gp))

# filter by ancestry group, then make wide matrix of scaled values
# create a list of the results
dat_mat_list <- map(ancestry_groups,
                    ~dat_long %>%
                      filter(anc_gp == .x) %>%
                      select(-c(anc_gp, value)) %>%
                      pivot_wider(names_from = metabolite, 
                                  values_from = scaled_value,
                                  #for simplicity putting 0 as missing values. Normally should do imputation.
                                  values_fill = 0) %>%
                      select(-id) %>%
                      as.matrix())

names(dat_mat_list) <- ancestry_groups

str(dat_mat_list)
## List of 4
##  $ ag1: num [1:400, 1:51] -1.154 -0.675 -1.182 1.211 0.6 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##  $ ag2: num [1:400, 1:51] -0.145 0.379 -1.474 -1.313 -0.897 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##  $ ag3: num [1:400, 1:51] -1.269 0.329 -0.07 1.317 0.259 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##  $ ag4: num [1:400, 1:51] 0.119 -1.257 0.131 -0.346 -0.541 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...

So now we have a list of 4 matrices, each with 400 standardized observations of 51 metabolites.

We can get to joint graphical lasso now!

fgl_results = JGL(Y = dat_mat_list,
                  penalty = "fused",
                  lambda1 = .08,
                  lambda2 = 0.02,
                  return.whole.theta = FALSE)

str(fgl_results)
## List of 3
##  $ theta            :List of 4
##   ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##  $ theta.unconnected: NULL
##  $ connected        : logi [1:51] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  - attr(*, "class")= chr "jgl"
print.jgl(fgl_results)
## 
## Number of connected nodes:  51 
## Number of subnetworks in each class:  1 1 1 1 
## Number of edges in each class:  294 300 272 292 
## Number of edges shared by all classes:  214 
## L1 norm of off-diagonal elements of classes' Thetas:  75.25031 75.50375 72.97726 72.85025
# extract all estimated covariance matrices from result
inv_covar_matrices <- fgl_results$theta
names(inv_covar_matrices) <- ancestry_groups

#now use function from igraph to create igraph graphs from adjacency matrices

graph_list <- map(ancestry_groups,
                  ~graph_from_adjacency_matrix(
                    -cov2cor(inv_covar_matrices[[.x]]),
                    weighted = T,
                    mode = "undirected",
                    diag = FALSE
                  ))
names(graph_list) <- ancestry_groups

plot.igraph(graph_list[["ag1"]],
            layout = layout.fruchterman.reingold)

plot_jgl(graph_list[[1]], multiplier = 3)

plot_jgl(graph_list[[2]], multiplier = 3)

plot_jgl(graph_list[[3]], multiplier = 3)

plot_jgl(graph_list[[4]], multiplier = 3)

Now run ggl

## run ggl:
ggl_results = JGL(Y=dat_mat_list,
                  penalty="group",
                  lambda1=.15,
                  lambda2=.2,
                  return.whole.theta=TRUE)
str(ggl_results)
## List of 2
##  $ theta    :List of 4
##   ..$ : num [1:51, 1:51] 1.09 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   ..$ : num [1:51, 1:51] 1.05 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   ..$ : num [1:51, 1:51] 1.1 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   ..$ : num [1:51, 1:51] 1.1 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##   .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
##  $ connected: logi [1:51] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  - attr(*, "class")= chr "jgl"
print.jgl(ggl_results)
## 
## Number of connected nodes:  48 
## Number of subnetworks in each class:  1 1 1 1 
## Number of edges in each class:  146 146 146 155 
## Number of edges shared by all classes:  130 
## L1 norm of off-diagonal elements of classes' Thetas:  36.99183 34.28299 37.9018 35.52136
# extract all estimated covariance matrices from result
ggl_inv_covar_matrices <- ggl_results$theta
names(ggl_inv_covar_matrices) <- ancestry_groups

#now use function from igraph to create igraph graphs from adjacency matrices

ggl_graph_list <- map(ancestry_groups,
                  ~graph_from_adjacency_matrix(
                    -cov2cor(ggl_inv_covar_matrices[[.x]]),
                    weighted = T,
                    mode = "undirected",
                    diag = FALSE
                  ))
names(ggl_graph_list) <- ancestry_groups


plot_jgl(ggl_graph_list[[1]], multiplier = 3)

plot_jgl(ggl_graph_list[[2]], multiplier = 3)

plot_jgl(ggl_graph_list[[3]], multiplier = 3)

plot_jgl(ggl_graph_list[[4]], multiplier = 3)

BUT what we’ve done so far we’ve had to pre-select the lambdas.

Let’s instead do a